MACS 40700
University of Chicago
20174018)2016-04-18, April 18th, 2017, etc.)lubridatelibrary(lubridate)ymd("2017-01-31")## [1] "2017-01-31"
mdy("January 31st, 2017")## [1] "2017-01-31"
dmy("31-Jan-2017")## [1] "2017-01-31"
(flights <- read_csv("data/flights-departed.csv"))## # A tibble: 7,671 × 2
## date value
## <date> <int>
## 1 1988-01-01 12681
## 2 1988-01-02 13264
## 3 1988-01-03 13953
## 4 1988-01-04 13921
## 5 1988-01-05 13932
## 6 1988-01-06 13157
## 7 1988-01-07 11159
## 8 1988-01-08 11631
## 9 1988-01-09 12045
## 10 1988-01-10 13160
## # ... with 7,661 more rows
ggplot(flights, aes(date, value)) +
geom_line() +
labs(x = NULL,
y = "Number of departing commercial flights")(flights <- flights %>%
mutate(year = year(date),
yday = yday(date),
# hack to label the x-axis with months
days = dmy(format(date,"%d-%m-2016"))))## # A tibble: 7,671 × 5
## date value year yday days
## <date> <int> <dbl> <dbl> <date>
## 1 1988-01-01 12681 1988 1 2016-01-01
## 2 1988-01-02 13264 1988 2 2016-01-02
## 3 1988-01-03 13953 1988 3 2016-01-03
## 4 1988-01-04 13921 1988 4 2016-01-04
## 5 1988-01-05 13932 1988 5 2016-01-05
## 6 1988-01-06 13157 1988 6 2016-01-06
## 7 1988-01-07 11159 1988 7 2016-01-07
## 8 1988-01-08 11631 1988 8 2016-01-08
## 9 1988-01-09 12045 1988 9 2016-01-09
## 10 1988-01-10 13160 1988 10 2016-01-10
## # ... with 7,661 more rows
ggplot(flights, aes(days, value)) +
geom_line(aes(group = year), alpha = .2) +
geom_smooth(se = FALSE) +
scale_x_date(labels = scales::date_format("%b")) +
labs(x = NULL,
y = "Number of departing commercial flights")(flights <- flights %>%
mutate(month = month(date, label = TRUE)))## # A tibble: 7,671 × 6
## date value year yday days month
## <date> <int> <dbl> <dbl> <date> <ord>
## 1 1988-01-01 12681 1988 1 2016-01-01 Jan
## 2 1988-01-02 13264 1988 2 2016-01-02 Jan
## 3 1988-01-03 13953 1988 3 2016-01-03 Jan
## 4 1988-01-04 13921 1988 4 2016-01-04 Jan
## 5 1988-01-05 13932 1988 5 2016-01-05 Jan
## 6 1988-01-06 13157 1988 6 2016-01-06 Jan
## 7 1988-01-07 11159 1988 7 2016-01-07 Jan
## 8 1988-01-08 11631 1988 8 2016-01-08 Jan
## 9 1988-01-09 12045 1988 9 2016-01-09 Jan
## 10 1988-01-10 13160 1988 10 2016-01-10 Jan
## # ... with 7,661 more rows
ggplot(flights, aes(month, value)) +
geom_violin() +
geom_boxplot(width = .1, outlier.shape = NA) +
labs(x = NULL,
y = "Number of departing commercial flights")value (number of departing flights)identitygeom_tile()facet_grid() (year X month)(flights <- flights %>%
mutate(weekday = wday(date, label = TRUE)))## # A tibble: 7,671 × 7
## date value year yday days month weekday
## <date> <int> <dbl> <dbl> <date> <ord> <ord>
## 1 1988-01-01 12681 1988 1 2016-01-01 Jan Fri
## 2 1988-01-02 13264 1988 2 2016-01-02 Jan Sat
## 3 1988-01-03 13953 1988 3 2016-01-03 Jan Sun
## 4 1988-01-04 13921 1988 4 2016-01-04 Jan Mon
## 5 1988-01-05 13932 1988 5 2016-01-05 Jan Tues
## 6 1988-01-06 13157 1988 6 2016-01-06 Jan Wed
## 7 1988-01-07 11159 1988 7 2016-01-07 Jan Thurs
## 8 1988-01-08 11631 1988 8 2016-01-08 Jan Fri
## 9 1988-01-09 12045 1988 9 2016-01-09 Jan Sat
## 10 1988-01-10 13160 1988 10 2016-01-10 Jan Sun
## # ... with 7,661 more rows
(flights <- flights %>%
# generate variables for week in the year (1-54) and the day in the year (1-366)
mutate(week = week(date),
yday = yday(date)) %>%
# normalize to draw calendar correctly - wday should represent the number of days from the Sunday of the week containing January 1st, then adjust based on that
group_by(year) %>%
mutate(yday = yday + wday(date)[1] - 2,
week = floor(yday / 7)) %>%
group_by(year, month) %>%
mutate(week_month = week - min(week) + 1))## Source: local data frame [7,671 x 9]
## Groups: year, month [252]
##
## date value year yday days month weekday week week_month
## <date> <int> <dbl> <dbl> <date> <ord> <ord> <dbl> <dbl>
## 1 1988-01-01 12681 1988 5 2016-01-01 Jan Fri 0 1
## 2 1988-01-02 13264 1988 6 2016-01-02 Jan Sat 0 1
## 3 1988-01-03 13953 1988 7 2016-01-03 Jan Sun 1 2
## 4 1988-01-04 13921 1988 8 2016-01-04 Jan Mon 1 2
## 5 1988-01-05 13932 1988 9 2016-01-05 Jan Tues 1 2
## 6 1988-01-06 13157 1988 10 2016-01-06 Jan Wed 1 2
## 7 1988-01-07 11159 1988 11 2016-01-07 Jan Thurs 1 2
## 8 1988-01-08 11631 1988 12 2016-01-08 Jan Fri 1 2
## 9 1988-01-09 12045 1988 13 2016-01-09 Jan Sat 1 2
## 10 1988-01-10 13160 1988 14 2016-01-10 Jan Sun 2 3
## # ... with 7,661 more rows
ggplot(flights, aes(weekday, week_month, fill = value)) +
facet_grid(year ~ month) +
geom_tile(color = "black") +
scale_fill_continuous(low = "green", high = "red") +
scale_x_discrete(labels = NULL) +
scale_y_reverse(labels = NULL) +
labs(title = "Domestic commercial flight activity",
x = NULL,
y = NULL,
fill = "Number of departing flights") +
theme_void() +
theme(legend.position = "bottom",
legend.text = element_text(angle = 45))p +
geom_smooth(method = "lm", se = FALSE)p +
geom_smooth(se = FALSE)r_plot <- function(r, n = 100){
xy <- ecodist::corgen(len = n, r = r) %>%
bind_cols
ggplot(xy, aes(x, y)) +
geom_point() +
ggtitle(str_c("Pearson's r = ", r))
}
r <- c(.8, 0, -.8)
for(r in r){
print(r_plot(r))
}pairs(select_if(credit, is.numeric))library(GGally)
ggpairs(select_if(credit, is.numeric))ggpairs(credit, mapping = aes(color = gender),
columns = c("income", "limit", "rating", "cards", "age", "education", "balance"))ggpairs(select_if(credit, is.numeric),
lower = list(
continuous = "smooth"
)
)ggpairs(select_if(credit, is.numeric),
lower = list(
continuous = wrap("smooth", alpha = .1, color = "blue")
)
)scatter_smooth <- function(data, mapping, ...) {
ggplot(data = data, mapping = mapping) +
# make data points transparent
geom_point(alpha = .2) +
# add default smoother
geom_smooth(se = FALSE)
}
ggpairs(select_if(credit, is.numeric),
lower = list(
continuous = scatter_smooth
)
)ggpairs(credit, mapping = aes(color = gender),
columns = c("income", "limit", "rating", "cards", "age", "education", "balance"),
lower = list(
continuous = scatter_smooth
)
)ggpairs(select(rcfss::scorecard, type:debt))(mpg_lite <- select_if(mpg, is.numeric))## # A tibble: 234 × 5
## displ year cyl cty hwy
## <dbl> <int> <int> <int> <int>
## 1 1.8 1999 4 18 29
## 2 1.8 1999 4 21 29
## 3 2.0 2008 4 20 31
## 4 2.0 2008 4 21 30
## 5 2.8 1999 6 16 26
## 6 2.8 1999 6 18 26
## 7 3.1 2008 6 18 27
## 8 1.8 1999 4 18 26
## 9 1.8 1999 4 16 25
## 10 2.0 2008 4 20 28
## # ... with 224 more rows
(cormat <- mpg_lite %>%
cor %>%
round(2))## displ year cyl cty hwy
## displ 1.00 0.15 0.93 -0.80 -0.77
## year 0.15 1.00 0.12 -0.04 0.00
## cyl 0.93 0.12 1.00 -0.81 -0.76
## cty -0.80 -0.04 -0.81 1.00 0.96
## hwy -0.77 0.00 -0.76 0.96 1.00
library(reshape2)
(melted_cormat <- melt(cormat))## Var1 Var2 value
## 1 displ displ 1.00
## 2 year displ 0.15
## 3 cyl displ 0.93
## 4 cty displ -0.80
## 5 hwy displ -0.77
## 6 displ year 0.15
## 7 year year 1.00
## 8 cyl year 0.12
## 9 cty year -0.04
## 10 hwy year 0.00
## 11 displ cyl 0.93
## 12 year cyl 0.12
## 13 cyl cyl 1.00
## 14 cty cyl -0.81
## 15 hwy cyl -0.76
## 16 displ cty -0.80
## 17 year cty -0.04
## 18 cyl cty -0.81
## 19 cty cty 1.00
## 20 hwy cty 0.96
## 21 displ hwy -0.77
## 22 year hwy 0.00
## 23 cyl hwy -0.76
## 24 cty hwy 0.96
## 25 hwy hwy 1.00
ggplot(melted_cormat, aes(x = Var1, y = Var2, fill = value)) +
geom_tile()# Get lower triangle of the correlation matrix
get_lower_tri<-function(cormat){
cormat[upper.tri(cormat)] <- NA
return(cormat)
}
# Get upper triangle of the correlation matrix
get_upper_tri <- function(cormat){
cormat[lower.tri(cormat)]<- NA
return(cormat)
}
upper_tri <- get_upper_tri(cormat)
upper_tri## displ year cyl cty hwy
## displ 1 0.15 0.93 -0.80 -0.77
## year NA 1.00 0.12 -0.04 0.00
## cyl NA NA 1.00 -0.81 -0.76
## cty NA NA NA 1.00 0.96
## hwy NA NA NA NA 1.00
melted_cormat <- melt(upper_tri, na.rm = TRUE)
ggplot(melted_cormat, aes(Var2, Var1, fill = value))+
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1)) +
coord_fixed()reorder_cormat <- function(cormat){
# Use correlation between variables as distance
dd <- as.dist((1-cormat)/2)
hc <- hclust(dd)
cormat <-cormat[hc$order, hc$order]
}
# Reorder the correlation matrix
cormat <- reorder_cormat(cormat)
upper_tri <- get_upper_tri(cormat)
# Melt the correlation matrix
melted_cormat <- melt(upper_tri, na.rm = TRUE)
# Create a ggheatmap
ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value))+
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+ # minimal theme
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1))+
coord_fixed()
# Print the heatmap
print(ggheatmap)ggheatmap +
geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) +
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
legend.position = "bottom")cormat_heatmap <- function(data){
# generate correlation matrix
cormat <- round(cor(data), 2)
# melt into a tidy table
get_upper_tri <- function(cormat){
cormat[lower.tri(cormat)]<- NA
return(cormat)
}
upper_tri <- get_upper_tri(cormat)
# reorder matrix based on coefficient value
reorder_cormat <- function(cormat){
# Use correlation between variables as distance
dd <- as.dist((1-cormat)/2)
hc <- hclust(dd)
cormat <-cormat[hc$order, hc$order]
}
cormat <- reorder_cormat(cormat)
upper_tri <- get_upper_tri(cormat)
# Melt the correlation matrix
melted_cormat <- melt(upper_tri, na.rm = TRUE)
# Create a ggheatmap
ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value))+
geom_tile(color = "white")+
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Pearson\nCorrelation") +
theme_minimal()+ # minimal theme
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 12, hjust = 1))+
coord_fixed()
# add correlation values to graph
ggheatmap +
geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) +
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
legend.position = "bottom")
}
cormat_heatmap(select_if(mpg, is.numeric))cormat_heatmap(select_if(credit, is.numeric))cormat_heatmap(select_if(diamonds, is.numeric))ggparcoord(data = iris, columns = 1:4, groupColumn = 5)# with the iris data, order the axes by overall class (Species) separation
# using the anyClass option
ggparcoord(data = iris, columns = 1:4, groupColumn = 5, order = "anyClass")# add points to the plot, add a title, and use an alpha scalar to make the
# lines transparent
p <- ggparcoord(data = iris, columns = 1:4, groupColumn = 5, order = "anyClass",
showPoints = TRUE, title = "Parallel Coordinate Plot for the Iris Data",
alphaLines = 0.3)
p# add some basic interactivity
ggplotly(p)## # A tibble: 1,317 × 5
## vote96 age educ female mhealth
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 60 12 0 0
## 2 1 36 12 0 1
## 3 0 21 13 0 7
## 4 0 29 13 0 6
## 5 1 39 18 1 2
## 6 1 41 15 1 1
## 7 1 48 20 0 2
## 8 0 20 12 1 9
## 9 0 27 11 1 9
## 10 0 34 7 1 2
## # ... with 1,307 more rows
## term estimate std.error statistic p.value
## 1 (Intercept) -5.0244 0.44482 -11.3 1.38e-29
## 2 age 0.0469 0.00441 10.6 1.94e-26
## 3 educ 0.2816 0.02629 10.7 9.32e-27
plot_ly(vote_prob, x = ~age, y = ~educ, z = ~prob) %>%
add_mesh()plot_ly(credit, x = ~limit, y = ~balance, z = ~income) %>%
add_mesh()plot_ly(z = ~volcano) %>% add_surface()volcano %>%
melt %>%
ggplot(aes(Var1, Var2, z = value)) +
geom_contour(aes(color = ..level..))